import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
# --- Load the dataset and take a first look at it ---
data = pd.read_csv("data.csv")
print(data.shape)

# Notebook-style display expressions: head/tail/summary statistics
data.head(3)
data.tail(3)
data.info()
data.describe()
data.describe().T

# Missing-value map: any bright cell in the heatmap marks a NaN
plt.figure(figsize=(16, 12))
sns.heatmap(data.isnull(), cmap='magma')

# Count duplicated rows (keep=False flags every member of a duplicate group)
data.duplicated(keep=False).sum()

# Normalise column names: trim whitespace, replace spaces with underscores,
# and drop the trailing '?' from the target column.
print("Column names before renaming", "\n", data.columns[:5], "\n")
data.columns = data.columns.str.strip().str.replace(" ", "_")
data.rename(columns={'Bankrupt?': 'Bankrupt'}, inplace=True)
print("Column names after renaming", "\n", data.columns[:5])
data.duplicated(keep=False).sum()  # no duplicates
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# Keep a handle on the frame before any feature-selection drops
columns = data.columns
columns
df = data
df

# Zero-variance filter: threshold=0 keeps only columns that vary at all
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(data)
var_thres.get_support()
sum(var_thres.get_support())
len(data.columns[var_thres.get_support()])

# Columns rejected by the filter are constant across every row
kept_columns = data.columns[var_thres.get_support()]
constant_columns = [column for column in data.columns if column not in kept_columns]
print(len(constant_columns))
for column in constant_columns:
    print(column)
data = data.drop(constant_columns, axis=1)

import seaborn as sns
# Using Pearson Correlation
corrmat = data.corr()
fig, ax = plt.subplots()
fig.set_size_inches(11, 11)
sns.heatmap(corrmat)
def correlation(dataset, threshold):
    """Return the set of column names that are highly correlated with an
    earlier column.

    Scans the lower triangle of the Pearson correlation matrix; a column is
    flagged as soon as its absolute correlation with any column that comes
    before it exceeds *threshold* (greedy, order-dependent selection).
    """
    flagged = set()
    corr = dataset.corr()
    cols = corr.columns
    for i in range(len(cols)):
        for j in range(i):
            # absolute value: strong negative correlation counts too
            if abs(corr.iloc[i, j]) > threshold:
                flagged.add(cols[i])
    return flagged
# Drop one member of every highly-correlated pair (|r| > 0.9)
corr_features = correlation(data, 0.9)
len(set(corr_features))
corr_features
data = data.drop(corr_features, axis=1)
data.shape
data['Bankrupt'].unique()

### Train test split to avoid overfitting
from sklearn.model_selection import train_test_split
# NOTE: `df` still aliases the frame captured before the column drops,
# so df['Bankrupt'] carries the same target values as data['Bankrupt'].
features = data.drop(labels=['Bankrupt'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    df['Bankrupt'],
    test_size=0.3,
    random_state=0,
)
from sklearn.feature_selection import mutual_info_classif
# determine the mutual information
mutual_info = mutual_info_classif(X_train, y_train)
mutual_info
mutual_info = pd.Series(mutual_info, index=X_train.columns)
mutual_info.sort_values(ascending=False)
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 8))

from sklearn.feature_selection import SelectKBest
# Keep the 70 features with the highest mutual information with the target
selector = SelectKBest(mutual_info_classif, k=70)
selector.fit(X_train, y_train)
c = X_train.columns[selector.get_support()]

# Rebuild the working frame from the selected columns plus the target
df = df[c]
df.shape
df.head()
df = pd.concat([df, data['Bankrupt']], axis=1).reindex(df.index)
df.shape
df['Bankrupt']
data = df
# First, separate all 71 features into two groups:
# 49 features are fraction-only features, whereas 24 are other than fraction-only features.
# Outliers are mainly present in these 24 "other than fraction-only" features.
# To explore the nature of the outliers, the distributions of these 24 features are obtained using:
## fn to separate only-fractional & other columns
def get_fraction_valued_columns(df):
    """Return the names of columns in *df* whose values all lie in [0, 1].

    Bug fix: the original body read the module-level ``data`` frame instead
    of the ``df`` parameter, so the argument was silently ignored.
    """
    my_columns = []
    for col in df.columns:
        # a column is "fraction-only" when every value sits inside [0, 1]
        if (df[col].max() <= 1) & (df[col].min() >= 0):
            my_columns.append(col)
    return my_columns
# Split the feature columns into fraction-only and everything else
features_only = data.drop(['Bankrupt'], axis=1)
fractional_columns = get_fraction_valued_columns(df=features_only)
non_fraction_columns = features_only.columns.difference(fractional_columns)
print("# Fraction-only Columns", len(fractional_columns), "\t", "# Other than Fraction-only Columns", len(non_fraction_columns))

# Histograms and boxplots of the non-fractional features (outlier hunting)
data[non_fraction_columns].hist(figsize=(20, 20), sharex=True, layout=(6, 4))
plt.show()
data[non_fraction_columns].boxplot(vert=False, figsize=(15, 10))
plt.subplots_adjust(left=0.25)
plt.show()
# Log-transform (log1p) the non-fractional columns with heavy outlier tails,
# mutating `data` in place and recording which columns were changed.
# NOTE(review): the loop body below appears to have lost its indentation in
# this export — L109-L110 are presumably nested under the `if`; confirm
# against the original notebook.
log_transformed_cols = []
for col in data[non_fraction_columns].columns:
# Transform when the extreme tail is huge (max >= 100x the 99th percentile)
# or when at most 10 points exceed the 99th percentile.
if (data[col].quantile(1) >= 100* data[col].quantile(0.99)) | (sum(data[col] > data[col].quantile(0.99)) <= 10):
# NOTE(review): np.log1p yields NaN for values < -1 — this assumes the
# selected financial-ratio features are non-negative; verify.
data[col] = np.log1p(data[col])
log_transformed_cols.append(col)
## Change names of log transformed column
# Prefix transformed columns with "log_" so downstream plots are labelled.
log_names = "log_" + data[log_transformed_cols].columns
data.rename(columns={data[log_transformed_cols].columns[i]: log_names[i] for i in range(len(log_names))}, inplace = True)
print("The following features are log transformed after they fulfill outlier detection condition.","\n\n",log_transformed_cols)
# Re-check the spread of the transformed features
data[log_names].boxplot(vert=False,figsize= (15,10))
plt.subplots_adjust(left=0.25)
plt.title("Boxplot of Outlier infected features after log transformation")
plt.show()
# Class balance of the target, as counts and percentages.
# Fix: the original merged two one-column frames and relied on pandas
# auto-generating 'Bankrupt_x'/'Bankrupt_y' suffixes; on pandas >= 2.0
# value_counts() names its result 'count', so the rename silently did
# nothing. Build the table with explicit column names instead.
counts = data.Bankrupt.value_counts()
percentages = 100 * data.Bankrupt.value_counts(normalize=True).astype(float)
tab = pd.DataFrame({"Count": counts, "Percentage": percentages})
print(tab)
plt.pie(tab['Count'], labels=[0, 1])
# Persist the engineered frame for the automated-EDA tools below.
data.to_csv('New_data.csv')
# Automated EDA with AutoViz: reads the CSV just written and renders plots.
# NOTE(review): this rebinds `df` to AutoViz's return value; later code
# reassigns `df` again before using it, but confirm nothing in between
# relies on the previous df.
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
df = AV.AutoViz('New_data.csv')
# EDA using Sweetviz
import sweetviz as sv
sweet_report = sv.analyze(data)
sweet_report.show_html('sweet_report.html')
# Separate features from target.
y = data['Bankrupt']
X = data.drop(['Bankrupt'], axis = 1)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2 , mutual_info_classif
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

smote = SMOTE(sampling_strategy='minority')

# Fix: split FIRST, then oversample only the training fold. The original
# ran SMOTE on the whole dataset before splitting, so synthetic points
# interpolated from test rows leaked into training and inflated test scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101, stratify=y)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Fully-oversampled copy kept for the cross-validated grid search below
# (that search re-splits internally).
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

# Standardise features; statistics are fit on the training data only and
# merely applied to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Candidate models and their hyper-parameter grids.
model_params = {
    'svm': {
        'model': SVC(gamma='auto', probability=True),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear', 'sigmoid']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1, 5, 10, 20, 30, 50],
            'criterion': ['gini', 'entropy']
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(multi_class='auto'),
        'params': {
            'C': [1, 5, 10],
            'solver': ['lbfgs', 'liblinear']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [1, 3, 5, 7],
            'algorithm': ['auto', 'kd_tree']
        }
    }
}

# Fix: scale inside a Pipeline so StandardScaler is fit on each CV training
# fold only. The original fit the scaler on the full dataset before
# cross-validation, leaking fold statistics into every fold's training.
scores = []
for model_name, mp in model_params.items():
    pipe = Pipeline([('scaler', StandardScaler()), ('model', mp['model'])])
    # Prefix grid keys so GridSearchCV routes them to the pipeline's model step
    grid = {'model__' + key: values for key, values in mp['params'].items()}
    clf = GridSearchCV(pipe, grid, cv=3, verbose=3, n_jobs=-1,
                       scoring='recall', return_train_score=False)
    clf.fit(X_sm, y_sm)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df
def _report(preds):
    """Show the confusion matrix and per-class metrics for *preds*."""
    pd.DataFrame(confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds))

# Train each tuned model on the (oversampled, scaled) training data and
# report its performance on the hold-out set.
model_svm = SVC(gamma='auto', C=20, kernel='rbf', probability=True)
model_svm.fit(X_train, y_train)
_report(model_svm.predict(X_test))

rfc = RandomForestClassifier(n_estimators=30, criterion='entropy')
rfc.fit(X_train, y_train)
_report(rfc.predict(X_test))

logmodel = LogisticRegression(solver='liblinear', multi_class='auto', C=10)
logmodel.fit(X_train, y_train)
_report(logmodel.predict(X_test))

knn = KNeighborsClassifier(n_neighbors=3, algorithm='auto')
knn.fit(X_train, y_train)
_report(knn.predict(X_test))
from sklearn.metrics import roc_curve, auc

# Fix: the section labels below were bare prose lines (leftover markdown
# cells) and were syntax errors in a .py file; they are comments now.

# ROC - Logistic
y_pred_logistic = logmodel.predict_proba(X_test)[:, 1]
logistic_fpr, logistic_tpr, threshold = roc_curve(y_test, y_pred_logistic)
auc_logistic = auc(logistic_fpr, logistic_tpr)

# ROC - Random Forest
y_pred_rfc = rfc.predict_proba(X_test)[:, 1]
rfc_fpr, rfc_tpr, threshold = roc_curve(y_test, y_pred_rfc)
auc_rfc = auc(rfc_fpr, rfc_tpr)

# ROC - SVM
y_pred_svm = model_svm.predict_proba(X_test)[:, 1]
svm_fpr, svm_tpr, threshold = roc_curve(y_test, y_pred_svm)
auc_svm = auc(svm_fpr, svm_tpr)

# ROC - KNN
y_pred_knn = knn.predict_proba(X_test)[:, 1]
knn_fpr, knn_tpr, threshold = roc_curve(y_test, y_pred_knn)
auc_knn = auc(knn_fpr, knn_tpr)

# Overlay all four ROC curves against the chance diagonal
plt.figure(figsize=(5, 5), dpi=100)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(rfc_fpr, rfc_tpr, linestyle='-', label='RFC (auc = %0.3f)' % auc_rfc)
plt.plot(logistic_fpr, logistic_tpr, marker='.', label='Logistic (auc = %0.3f)' % auc_logistic)
plt.plot(svm_fpr, svm_tpr, marker='+', label='SVM (auc = %0.3f)' % auc_svm)
plt.plot(knn_fpr, knn_tpr, linestyle='-', label='KNN (auc = %0.3f)' % auc_knn)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
import pickle

# Persist every trained model. Fixes in this block: the original repeated
# `import pickle` four times and opened each file without closing it;
# `with` guarantees the handles are flushed and closed.
for filename, model in [
    ('SVM-model.pkl', model_svm),
    ('Logistic-model.pkl', logmodel),
    ('KNN-model.pkl', knn),
    ('Random-Forest-model.pkl', rfc),
]:
    with open(filename, 'wb') as fh:
        pickle.dump(model, fh)
# pip install numpy  (shell command -- run in a terminal, not as Python code)